{
 "cells": [
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "模型训练",
   "id": "200ab1acb6dbd3dd"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-12T03:36:45.364534Z",
     "start_time": "2024-12-12T03:36:35.462498Z"
    }
   },
   "cell_type": "code",
   "source": [
    "from transformers import AutoConfig, AutoModel, AutoTokenizer, pipeline, Trainer, TrainingArguments\n",
    "from datasets import load_dataset\n",
    "model_path = 'E:/model/rbt3'"
   ],
   "id": "904d058b32026536",
   "outputs": [],
   "execution_count": 1
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-12T03:36:53.355927Z",
     "start_time": "2024-12-12T03:36:52.468786Z"
    }
   },
   "cell_type": "code",
   "source": [
    "data = load_dataset(\"csv\", data_files=\"./_test/data.csv\", split=\"train\")\n",
    "data = data.filter(lambda x: x[\"review\"] is not None)\n",
    "# 划分测试数据10%\n",
    "data = data.train_test_split(test_size=0.1)\n"
   ],
   "id": "a0af784e89c5c713",
   "outputs": [],
   "execution_count": 2
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": "data",
   "id": "bd88e5035f2784a9",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "数据集预处理",
   "id": "cd53a78ca1998035"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-12T03:36:59.832922Z",
     "start_time": "2024-12-12T03:36:57.105683Z"
    }
   },
   "cell_type": "code",
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained(model_path)\n",
    "\n",
    "\n",
    "def process_function(examples):\n",
    "    tokenized_examples = tokenizer(examples[\"review\"], max_length=128, truncation=True)\n",
    "    tokenized_examples[\"labels\"] = examples[\"label\"]\n",
    "    return tokenized_examples\n",
    "\n",
    "\n",
    "tokenized_datasets = data.map(process_function, batched=True, remove_columns=data[\"train\"].column_names)\n",
    "tokenized_datasets"
   ],
   "id": "682d9505896da0de",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Map:   0%|          | 0/6988 [00:00<?, ? examples/s]"
      ],
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "bee86545c492478182440a9f3f1a895e"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "Map:   0%|          | 0/777 [00:00<?, ? examples/s]"
      ],
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "1b63097018154681b42502e916e94187"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],\n",
       "        num_rows: 6988\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],\n",
       "        num_rows: 777\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 3
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-12T03:40:32.891252Z",
     "start_time": "2024-12-12T03:40:32.874565Z"
    }
   },
   "cell_type": "code",
   "source": "tokenized_datasets['train']",
   "id": "d81b9ddd8c4799a0",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],\n",
       "    num_rows: 6988\n",
       "})"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 14
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "加载模型",
   "id": "9efaa1866a90bab7"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-12T03:37:07.079891Z",
     "start_time": "2024-12-12T03:37:05.782082Z"
    }
   },
   "cell_type": "code",
   "source": [
    "model = AutoModel.from_pretrained(model_path)\n",
    "model.config"
   ],
   "id": "c66a178a0a814b15",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "BertConfig {\n",
       "  \"_attn_implementation_autoset\": true,\n",
       "  \"_name_or_path\": \"E:/model/rbt3\",\n",
       "  \"architectures\": [\n",
       "    \"BertForMaskedLM\"\n",
       "  ],\n",
       "  \"attention_probs_dropout_prob\": 0.1,\n",
       "  \"classifier_dropout\": null,\n",
       "  \"directionality\": \"bidi\",\n",
       "  \"hidden_act\": \"gelu\",\n",
       "  \"hidden_dropout_prob\": 0.1,\n",
       "  \"hidden_size\": 768,\n",
       "  \"initializer_range\": 0.02,\n",
       "  \"intermediate_size\": 3072,\n",
       "  \"layer_norm_eps\": 1e-12,\n",
       "  \"max_position_embeddings\": 512,\n",
       "  \"model_type\": \"bert\",\n",
       "  \"num_attention_heads\": 12,\n",
       "  \"num_hidden_layers\": 3,\n",
       "  \"output_past\": true,\n",
       "  \"pad_token_id\": 0,\n",
       "  \"pooler_fc_size\": 768,\n",
       "  \"pooler_num_attention_heads\": 12,\n",
       "  \"pooler_num_fc_layers\": 3,\n",
       "  \"pooler_size_per_head\": 128,\n",
       "  \"pooler_type\": \"first_token_transform\",\n",
       "  \"position_embedding_type\": \"absolute\",\n",
       "  \"transformers_version\": \"4.47.0\",\n",
       "  \"type_vocab_size\": 2,\n",
       "  \"use_cache\": true,\n",
       "  \"vocab_size\": 21128\n",
       "}"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 4
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "开始评估",
   "id": "6db0f4d8b19c11b5"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-12T03:53:51.408075Z",
     "start_time": "2024-12-12T03:53:50.858269Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import evaluate\n",
    "\n",
    "accuracy = evaluate.load(\"./metrics/accuracy\")\n",
    "f1_eval = evaluate.load(\"./metrics/f1\")"
   ],
   "id": "f6b04b8d7f47828a",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "EvaluationModule(name: \"f1\", module_type: \"metric\", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: \"\"\"\n",
       "Args:\n",
       "    predictions (`list` of `int`): Predicted labels.\n",
       "    references (`list` of `int`): Ground truth labels.\n",
       "    labels (`list` of `int`): The set of labels to include when `average` is not set to `'binary'`, and the order of the labels if `average` is `None`. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class. Labels not present in the data will result in 0 components in a macro average. For multilabel targets, labels are column indices. By default, all labels in `predictions` and `references` are used in sorted order. Defaults to None.\n",
       "    pos_label (`int`): The class to be considered the positive class, in the case where `average` is set to `binary`. Defaults to 1.\n",
       "    average (`string`): This parameter is required for multiclass/multilabel targets. If set to `None`, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data. Defaults to `'binary'`.\n",
       "\n",
       "        - 'binary': Only report results for the class specified by `pos_label`. This is applicable only if the classes found in `predictions` and `references` are binary.\n",
       "        - 'micro': Calculate metrics globally by counting the total true positives, false negatives and false positives.\n",
       "        - 'macro': Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.\n",
       "        - 'weighted': Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label). This alters `'macro'` to account for label imbalance. This option can result in an F-score that is not between precision and recall.\n",
       "        - 'samples': Calculate metrics for each instance, and find their average (only meaningful for multilabel classification).\n",
       "    sample_weight (`list` of `float`): Sample weights Defaults to None.\n",
       "\n",
       "Returns:\n",
       "    f1 (`float` or `array` of `float`): F1 score or list of f1 scores, depending on the value passed to `average`. Minimum possible value is 0. Maximum possible value is 1. Higher f1 scores are better.\n",
       "\n",
       "Examples:\n",
       "\n",
       "    Example 1-A simple binary example\n",
       "        >>> f1_metric = evaluate.load(\"f1\")\n",
       "        >>> results = f1_metric.compute(references=[0, 1, 0, 1, 0], predictions=[0, 0, 1, 1, 0])\n",
       "        >>> print(results)\n",
       "        {'f1': 0.5}\n",
       "\n",
       "    Example 2-The same simple binary example as in Example 1, but with `pos_label` set to `0`.\n",
       "        >>> f1_metric = evaluate.load(\"f1\")\n",
       "        >>> results = f1_metric.compute(references=[0, 1, 0, 1, 0], predictions=[0, 0, 1, 1, 0], pos_label=0)\n",
       "        >>> print(round(results['f1'], 2))\n",
       "        0.67\n",
       "\n",
       "    Example 3-The same simple binary example as in Example 1, but with `sample_weight` included.\n",
       "        >>> f1_metric = evaluate.load(\"f1\")\n",
       "        >>> results = f1_metric.compute(references=[0, 1, 0, 1, 0], predictions=[0, 0, 1, 1, 0], sample_weight=[0.9, 0.5, 3.9, 1.2, 0.3])\n",
       "        >>> print(round(results['f1'], 2))\n",
       "        0.35\n",
       "\n",
       "    Example 4-A multiclass example, with different values for the `average` input.\n",
       "        >>> predictions = [0, 2, 1, 0, 0, 1]\n",
       "        >>> references = [0, 1, 2, 0, 1, 2]\n",
       "        >>> results = f1_metric.compute(predictions=predictions, references=references, average=\"macro\")\n",
       "        >>> print(round(results['f1'], 2))\n",
       "        0.27\n",
       "        >>> results = f1_metric.compute(predictions=predictions, references=references, average=\"micro\")\n",
       "        >>> print(round(results['f1'], 2))\n",
       "        0.33\n",
       "        >>> results = f1_metric.compute(predictions=predictions, references=references, average=\"weighted\")\n",
       "        >>> print(round(results['f1'], 2))\n",
       "        0.27\n",
       "        >>> results = f1_metric.compute(predictions=predictions, references=references, average=None)\n",
       "        >>> print(results)\n",
       "        {'f1': array([0.8, 0. , 0. ])}\n",
       "\n",
       "    Example 5-A multi-label example\n",
       "        >>> f1_metric = evaluate.load(\"f1\", \"multilabel\")\n",
       "        >>> results = f1_metric.compute(predictions=[[0, 1, 1], [1, 1, 0]], references=[[0, 1, 1], [0, 1, 0]], average=\"macro\")\n",
       "        >>> print(round(results['f1'], 2))\n",
       "        0.67\n",
       "\"\"\", stored examples: 0)"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 25
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-12T03:53:16.534004Z",
     "start_time": "2024-12-12T03:53:16.529287Z"
    }
   },
   "cell_type": "code",
   "source": [
    "def eval_metric(eval_predict):\n",
    "    predictions, labels = eval_predict\n",
    "    predictions = predictions.argmax(axis=-1)\n",
    "    acc = accuracy.compute(predictions=predictions, references=labels)\n",
    "    # f1 = f1_eval.compute(predictions=predictions, references=labels)\n",
    "    # acc.update(f1)\n",
    "    return acc"
   ],
   "id": "62fdf55772dc4f90",
   "outputs": [],
   "execution_count": 21
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-12T03:37:20.886990Z",
     "start_time": "2024-12-12T03:37:20.845024Z"
    }
   },
   "cell_type": "code",
   "source": [
    "train_args = TrainingArguments(output_dir=\"./checkpoints\",  # 输出文件夹\n",
    "                               per_device_train_batch_size=64,  # 训练时的batch_size\n",
    "                               per_device_eval_batch_size=128,  # 验证时的batch_size\n",
    "                               logging_steps=10,  # log 打印的频率\n",
    "                               evaluation_strategy=\"epoch\",  # 评估策略\n",
    "                               save_strategy=\"epoch\",  # 保存策略\n",
    "                               save_total_limit=3,  # 最大保存数\n",
    "                               learning_rate=2e-5,  # 学习率\n",
    "                               weight_decay=0.01,  # weight_decay\n",
    "                               metric_for_best_model=\"f1\",  # 设定评估指标\n",
    "                               load_best_model_at_end=True)  # 训练完成后加载最优模型\n",
    "train_args"
   ],
   "id": "9aa7df1884aaa14a",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\_job\\job.py\\model\\.venv\\Lib\\site-packages\\transformers\\training_args.py:1575: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "TrainingArguments(\n",
       "_n_gpu=0,\n",
       "accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},\n",
       "adafactor=False,\n",
       "adam_beta1=0.9,\n",
       "adam_beta2=0.999,\n",
       "adam_epsilon=1e-08,\n",
       "auto_find_batch_size=False,\n",
       "average_tokens_across_devices=False,\n",
       "batch_eval_metrics=False,\n",
       "bf16=False,\n",
       "bf16_full_eval=False,\n",
       "data_seed=None,\n",
       "dataloader_drop_last=False,\n",
       "dataloader_num_workers=0,\n",
       "dataloader_persistent_workers=False,\n",
       "dataloader_pin_memory=True,\n",
       "dataloader_prefetch_factor=None,\n",
       "ddp_backend=None,\n",
       "ddp_broadcast_buffers=None,\n",
       "ddp_bucket_cap_mb=None,\n",
       "ddp_find_unused_parameters=None,\n",
       "ddp_timeout=1800,\n",
       "debug=[],\n",
       "deepspeed=None,\n",
       "disable_tqdm=False,\n",
       "dispatch_batches=None,\n",
       "do_eval=True,\n",
       "do_predict=False,\n",
       "do_train=False,\n",
       "eval_accumulation_steps=None,\n",
       "eval_delay=0,\n",
       "eval_do_concat_batches=True,\n",
       "eval_on_start=False,\n",
       "eval_steps=None,\n",
       "eval_strategy=IntervalStrategy.EPOCH,\n",
       "eval_use_gather_object=False,\n",
       "evaluation_strategy=epoch,\n",
       "fp16=False,\n",
       "fp16_backend=auto,\n",
       "fp16_full_eval=False,\n",
       "fp16_opt_level=O1,\n",
       "fsdp=[],\n",
       "fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},\n",
       "fsdp_min_num_params=0,\n",
       "fsdp_transformer_layer_cls_to_wrap=None,\n",
       "full_determinism=False,\n",
       "gradient_accumulation_steps=1,\n",
       "gradient_checkpointing=False,\n",
       "gradient_checkpointing_kwargs=None,\n",
       "greater_is_better=True,\n",
       "group_by_length=False,\n",
       "half_precision_backend=auto,\n",
       "hub_always_push=False,\n",
       "hub_model_id=None,\n",
       "hub_private_repo=None,\n",
       "hub_strategy=HubStrategy.EVERY_SAVE,\n",
       "hub_token=<HUB_TOKEN>,\n",
       "ignore_data_skip=False,\n",
       "include_for_metrics=[],\n",
       "include_inputs_for_metrics=False,\n",
       "include_num_input_tokens_seen=False,\n",
       "include_tokens_per_second=False,\n",
       "jit_mode_eval=False,\n",
       "label_names=None,\n",
       "label_smoothing_factor=0.0,\n",
       "learning_rate=2e-05,\n",
       "length_column_name=length,\n",
       "load_best_model_at_end=True,\n",
       "local_rank=0,\n",
       "log_level=passive,\n",
       "log_level_replica=warning,\n",
       "log_on_each_node=True,\n",
       "logging_dir=./checkpoints\\runs\\Dec12_11-37-20_DESKTOP-8HQ99TG,\n",
       "logging_first_step=False,\n",
       "logging_nan_inf_filter=True,\n",
       "logging_steps=10,\n",
       "logging_strategy=IntervalStrategy.STEPS,\n",
       "lr_scheduler_kwargs={},\n",
       "lr_scheduler_type=SchedulerType.LINEAR,\n",
       "max_grad_norm=1.0,\n",
       "max_steps=-1,\n",
       "metric_for_best_model=f1,\n",
       "mp_parameters=,\n",
       "neftune_noise_alpha=None,\n",
       "no_cuda=False,\n",
       "num_train_epochs=3.0,\n",
       "optim=OptimizerNames.ADAMW_TORCH,\n",
       "optim_args=None,\n",
       "optim_target_modules=None,\n",
       "output_dir=./checkpoints,\n",
       "overwrite_output_dir=False,\n",
       "past_index=-1,\n",
       "per_device_eval_batch_size=128,\n",
       "per_device_train_batch_size=64,\n",
       "prediction_loss_only=False,\n",
       "push_to_hub=False,\n",
       "push_to_hub_model_id=None,\n",
       "push_to_hub_organization=None,\n",
       "push_to_hub_token=<PUSH_TO_HUB_TOKEN>,\n",
       "ray_scope=last,\n",
       "remove_unused_columns=True,\n",
       "report_to=[],\n",
       "restore_callback_states_from_checkpoint=False,\n",
       "resume_from_checkpoint=None,\n",
       "run_name=./checkpoints,\n",
       "save_on_each_node=False,\n",
       "save_only_model=False,\n",
       "save_safetensors=True,\n",
       "save_steps=500,\n",
       "save_strategy=SaveStrategy.EPOCH,\n",
       "save_total_limit=3,\n",
       "seed=42,\n",
       "skip_memory_metrics=True,\n",
       "split_batches=None,\n",
       "tf32=None,\n",
       "torch_compile=False,\n",
       "torch_compile_backend=None,\n",
       "torch_compile_mode=None,\n",
       "torch_empty_cache_steps=None,\n",
       "torchdynamo=None,\n",
       "tpu_metrics_debug=False,\n",
       "tpu_num_cores=None,\n",
       "use_cpu=False,\n",
       "use_ipex=False,\n",
       "use_legacy_prediction_loop=False,\n",
       "use_liger_kernel=False,\n",
       "use_mps_device=False,\n",
       "warmup_ratio=0.0,\n",
       "warmup_steps=0,\n",
       "weight_decay=0.01,\n",
       ")"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 7
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-12T03:53:20.070137Z",
     "start_time": "2024-12-12T03:53:20.054189Z"
    }
   },
   "cell_type": "code",
   "source": [
    "#创建Trainer\n",
    "\n",
    "from transformers import DataCollatorWithPadding\n",
    "trainer = Trainer(model=model, \n",
    "                  args=train_args, \n",
    "                  train_dataset=tokenized_datasets[\"train\"], \n",
    "                  eval_dataset=tokenized_datasets[\"test\"], \n",
    "                  data_collator=DataCollatorWithPadding(tokenizer=tokenizer),\n",
    "                  compute_metrics=eval_metric)"
   ],
   "id": "baf4bff8621733a4",
   "outputs": [],
   "execution_count": 22
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-12T03:53:25.296194Z",
     "start_time": "2024-12-12T03:53:21.792926Z"
    }
   },
   "cell_type": "code",
   "source": "trainer.train()",
   "id": "ef2fe79d5b6f10e6",
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "The model did not return a loss from the inputs, only the following keys: last_hidden_state,pooler_output. For reference, the inputs it received are input_ids,token_type_ids,attention_mask.",
     "output_type": "error",
     "traceback": [
      "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
      "\u001B[1;31mValueError\u001B[0m                                Traceback (most recent call last)",
      "Cell \u001B[1;32mIn[23], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[43mtrainer\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mtrain\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n",
      "File \u001B[1;32mD:\\_job\\job.py\\model\\.venv\\Lib\\site-packages\\transformers\\trainer.py:2164\u001B[0m, in \u001B[0;36mTrainer.train\u001B[1;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001B[0m\n\u001B[0;32m   2162\u001B[0m         hf_hub_utils\u001B[38;5;241m.\u001B[39menable_progress_bars()\n\u001B[0;32m   2163\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m-> 2164\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43minner_training_loop\u001B[49m\u001B[43m(\u001B[49m\n\u001B[0;32m   2165\u001B[0m \u001B[43m        \u001B[49m\u001B[43margs\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m   2166\u001B[0m \u001B[43m        \u001B[49m\u001B[43mresume_from_checkpoint\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mresume_from_checkpoint\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m   2167\u001B[0m \u001B[43m        \u001B[49m\u001B[43mtrial\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtrial\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m   2168\u001B[0m \u001B[43m        \u001B[49m\u001B[43mignore_keys_for_eval\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mignore_keys_for_eval\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m   2169\u001B[0m \u001B[43m    \u001B[49m\u001B[43m)\u001B[49m\n",
      "File \u001B[1;32mD:\\_job\\job.py\\model\\.venv\\Lib\\site-packages\\transformers\\trainer.py:2522\u001B[0m, in \u001B[0;36mTrainer._inner_training_loop\u001B[1;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001B[0m\n\u001B[0;32m   2516\u001B[0m context \u001B[38;5;241m=\u001B[39m (\n\u001B[0;32m   2517\u001B[0m     functools\u001B[38;5;241m.\u001B[39mpartial(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39maccelerator\u001B[38;5;241m.\u001B[39mno_sync, model\u001B[38;5;241m=\u001B[39mmodel)\n\u001B[0;32m   2518\u001B[0m     \u001B[38;5;28;01mif\u001B[39;00m i \u001B[38;5;241m!=\u001B[39m \u001B[38;5;28mlen\u001B[39m(batch_samples) \u001B[38;5;241m-\u001B[39m \u001B[38;5;241m1\u001B[39m\n\u001B[0;32m   2519\u001B[0m     \u001B[38;5;28;01melse\u001B[39;00m contextlib\u001B[38;5;241m.\u001B[39mnullcontext\n\u001B[0;32m   2520\u001B[0m )\n\u001B[0;32m   2521\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m context():\n\u001B[1;32m-> 2522\u001B[0m     tr_loss_step \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mtraining_step\u001B[49m\u001B[43m(\u001B[49m\u001B[43mmodel\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43minputs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mnum_items_in_batch\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m   2524\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m (\n\u001B[0;32m   2525\u001B[0m     args\u001B[38;5;241m.\u001B[39mlogging_nan_inf_filter\n\u001B[0;32m   2526\u001B[0m     \u001B[38;5;129;01mand\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m is_torch_xla_available()\n\u001B[0;32m   2527\u001B[0m     \u001B[38;5;129;01mand\u001B[39;00m (torch\u001B[38;5;241m.\u001B[39misnan(tr_loss_step) \u001B[38;5;129;01mor\u001B[39;00m torch\u001B[38;5;241m.\u001B[39misinf(tr_loss_step))\n\u001B[0;32m   2528\u001B[0m ):\n\u001B[0;32m   2529\u001B[0m     \u001B[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001B[39;00m\n\u001B[0;32m   2530\u001B[0m     tr_loss \u001B[38;5;241m=\u001B[39m tr_loss \u001B[38;5;241m+\u001B[39m tr_loss \u001B[38;5;241m/\u001B[39m (\u001B[38;5;241m1\u001B[39m \u001B[38;5;241m+\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mstate\u001B[38;5;241m.\u001B[39mglobal_step \u001B[38;5;241m-\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_globalstep_last_logged)\n",
      "File \u001B[1;32mD:\\_job\\job.py\\model\\.venv\\Lib\\site-packages\\transformers\\trainer.py:3655\u001B[0m, in \u001B[0;36mTrainer.training_step\u001B[1;34m(self, model, inputs, num_items_in_batch)\u001B[0m\n\u001B[0;32m   3653\u001B[0m         loss \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcompute_loss(model, inputs)\n\u001B[0;32m   3654\u001B[0m     \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m-> 3655\u001B[0m         loss \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcompute_loss\u001B[49m\u001B[43m(\u001B[49m\u001B[43mmodel\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43minputs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mnum_items_in_batch\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mnum_items_in_batch\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m   3657\u001B[0m \u001B[38;5;28;01mdel\u001B[39;00m inputs\n\u001B[0;32m   3658\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m (\n\u001B[0;32m   3659\u001B[0m     \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39margs\u001B[38;5;241m.\u001B[39mtorch_empty_cache_steps \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m\n\u001B[0;32m   3660\u001B[0m     \u001B[38;5;129;01mand\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mstate\u001B[38;5;241m.\u001B[39mglobal_step \u001B[38;5;241m%\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39margs\u001B[38;5;241m.\u001B[39mtorch_empty_cache_steps \u001B[38;5;241m==\u001B[39m \u001B[38;5;241m0\u001B[39m\n\u001B[0;32m   3661\u001B[0m ):\n",
      "File \u001B[1;32mD:\\_job\\job.py\\model\\.venv\\Lib\\site-packages\\transformers\\trainer.py:3730\u001B[0m, in \u001B[0;36mTrainer.compute_loss\u001B[1;34m(self, model, inputs, return_outputs, num_items_in_batch)\u001B[0m\n\u001B[0;32m   3728\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m   3729\u001B[0m     \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(outputs, \u001B[38;5;28mdict\u001B[39m) \u001B[38;5;129;01mand\u001B[39;00m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mloss\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m outputs:\n\u001B[1;32m-> 3730\u001B[0m         \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\n\u001B[0;32m   3731\u001B[0m             \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mThe model did not return a loss from the inputs, only the following keys: \u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m   3732\u001B[0m             \u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;132;01m{\u001B[39;00m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m,\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;241m.\u001B[39mjoin(outputs\u001B[38;5;241m.\u001B[39mkeys())\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m. For reference, the inputs it received are \u001B[39m\u001B[38;5;132;01m{\u001B[39;00m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m,\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;241m.\u001B[39mjoin(inputs\u001B[38;5;241m.\u001B[39mkeys())\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m.\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m   3733\u001B[0m         )\n\u001B[0;32m   3734\u001B[0m     \u001B[38;5;66;03m# We don't use .loss here since the model may return tuples instead of ModelOutput.\u001B[39;00m\n\u001B[0;32m   3735\u001B[0m     loss \u001B[38;5;241m=\u001B[39m outputs[\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mloss\u001B[39m\u001B[38;5;124m\"\u001B[39m] \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(outputs, \u001B[38;5;28mdict\u001B[39m) \u001B[38;5;28;01melse\u001B[39;00m outputs[\u001B[38;5;241m0\u001B[39m]\n",
      "\u001B[1;31mValueError\u001B[0m: The model did not return a loss from the inputs, only the following keys: last_hidden_state,pooler_output. For reference, the inputs it received are input_ids,token_type_ids,attention_mask."
     ]
    }
   ],
   "execution_count": 23
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
